By conducting these analyses, we can gain insights into the journal's readership, contributing organizations, authors, funding, and keyword trends.

Loading Dataset

# Silence pandas FutureWarnings so notebook output stays readable.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
# Widen display limits so wide rows/columns are not truncated in the notebook.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_colwidth', 100)
# full dataset: one row per article with metadata (views, downloads, authors, ...)
df = pd.read_csv('data/bcas_dataset_fin.csv')
df.shape
(7216, 18)
# column dtypes and non-null counts (several *_en columns are sparsely filled)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7216 entries, 0 to 7215
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           7216 non-null   object
 1   date          7216 non-null   int64 
 2   views         7216 non-null   int64 
 3   downloads     7216 non-null   int64 
 4   author_cn     7209 non-null   object
 5   author_en     7216 non-null   object
 6   title_cn      7216 non-null   object
 7   title_en      3384 non-null   object
 8   org_cn        4104 non-null   object
 9   org_en        2150 non-null   object
 10  abstract_cn   4750 non-null   object
 11  abstract_en   2233 non-null   object
 12  keywords_cn   5080 non-null   object
 13  keywords_en   2145 non-null   object
 14  fund_project  1054 non-null   object
 15  similar       7128 non-null   object
 16  issue         7216 non-null   object
 17  page          7216 non-null   object
dtypes: int64(3), object(15)
memory usage: 1014.9+ KB
# the 'date' column actually holds publication years
df = df.rename(columns={'date': 'year'})

# drop 2024: the year is still in progress, so its counts would be partial
df = df.query('year < 2024')
df.shape
(7048, 18)

Views & Downloads

# Histogram of per-article view counts.
fig = px.histogram(df, x="views", nbins=400)

fig.update_layout(
    layout,
    title='Distribution of Article Views',
    xaxis={'title': "Views", 'tickangle': 0},
    yaxis={'title': "Count"},
)

fig.update_traces(
    marker_color='#0E86D4',
    hovertemplate='Views: %{x}<br>Count: %{y}<extra></extra>',
)

fig.show()
# Scatter of views vs. downloads for each article.
fig = px.scatter(df, x="views", y="downloads", log_x=False)

fig.update_layout(
    layout,
    title='Relationship Between Article Views and Downloads',
    xaxis_title="Views",
    yaxis_title="Downloads",
)

fig.update_traces(
    marker_color='#0E86D4',
    hovertemplate='Views: %{x}<br>Downloads: %{y}<extra></extra>',
)

fig.show()
# Yearly totals of views and downloads, one line per metric.
gp = df.groupby('year')[['views', 'downloads']].sum().reset_index()

fig = go.Figure()

# both traces share the same structure, so build them in a loop
for column, label, color in (('views', 'Views', '#0E86D4'),
                             ('downloads', 'Downloads', '#FF3131')):
    fig.add_trace(go.Scatter(
        x=gp['year'],
        y=gp[column],
        name=label,
        line=dict(color=color, width=3),
        hovertemplate='Year: %{x}<br>' + label + ': %{y}<extra></extra>',
    ))

fig.update_layout(
    layout,
    width=900,
    title='Views and Downloads Trend, 1986-2023',
    xaxis=dict(
        type='category',
        tickmode='array',
        title="",
        minor=dict(ticks="inside", showgrid=True),
    ),
    yaxis=dict(title="Count"),
)

fig.show()
# calculate avg downloads and views per article for each year
#
# FIX: the original assigned `df.groupby('year').size().values`, which relies
# on *positional* alignment between two independently-built frames; assigning
# the Series before reset_index aligns on the year index and is order-safe.
gp = df.groupby('year')[['views', 'downloads']].sum()
gp['articles'] = df.groupby('year').size()  # index-aligned on year
gp = gp.reset_index()
gp['views_avg'] = gp.views/gp.articles
gp['downloads_avg'] = gp.downloads/gp.articles

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=gp['year'],
    y=gp['views_avg'],
    name='Views per Article',
    line=dict(color='#0E86D4', width=3),
    hovertemplate='Year: %{x}<br>Views Avg: %{y}<extra></extra>'
))

fig.add_trace(go.Scatter(
    x=gp['year'],
    y=gp['downloads_avg'],
    name='Downloads per Article',
    line=dict(color='#FF3131', width=3),
    hovertemplate='Year: %{x}<br>Downloads Avg: %{y}<extra></extra>'
))

fig.update_layout(
    layout,
    width=950,
    title='Views and Downloads per Article Trend, 1986-2023',
    xaxis=dict(
        type='category',
        title="",
        minor=dict(ticks="inside", showgrid=True)
    ),
    yaxis=dict(
        title="Count",
    )
)

fig.show()

Organizations

General Statistics

# orgs_flat: one row per (article, organization) pair
orgs = pd.read_csv('data/orgs_flat.csv')
orgs = orgs.query('year < 2024')

# distinct head organizations active in each year
gp = orgs.groupby('year')['orgs_head'].nunique().reset_index()

fig = px.line(gp, x='year', y='orgs_head', title='')

fig.update_layout(
    layout,
    title='Organization Count by Year, 1986-2023',
    xaxis={'type': 'category',
           'minor': {'ticks': "inside", 'showgrid': True}},
    yaxis={'title': "Organizations"},
)

fig.update_traces(
    line_color='#0E86D4',
    line=dict(width=3),
    hovertemplate='Year: %{x}<br>Organizations: %{y}<extra></extra>',
)

fig.show()
# Average number of distinct organizations per article, by year.
article_total_year = df.groupby('year')['title_cn'].count().reset_index()
orgs_per_year = orgs.groupby('year')['orgs_head'].nunique().reset_index()

# FIX: join on 'year' instead of dividing two positionally-aligned columns;
# the original silently produced wrong ratios if either frame had a year gap
# or a different starting year.
orgs_per_year = orgs_per_year.merge(article_total_year, on='year', how='left')
orgs_per_year['orgs_per_article'] = (orgs_per_year['orgs_head']
                                     / orgs_per_year['title_cn'])

fig = px.line(orgs_per_year, x='year', y='orgs_per_article', title='')

fig.update_layout(
    layout,
    xaxis=dict(
        type='category',
        minor=dict(ticks="inside", showgrid=True)
    ),
    yaxis=dict(
        title="Organizations per Article"
    )
)

fig.update_traces(
    line_color='#0E86D4',
    line=dict(width=3),
    hovertemplate='Year: %{x}<br>Organizations per Article: %{y:.2f}<extra></extra>'
)

fig.show()
# restrict to the last full decade (between() is inclusive on both ends,
# equivalent to > 2012 & < 2024 for integer years)
orgs = orgs[orgs['year'].between(2013, 2023)]

# ten most frequent head organizations
gp = (orgs.orgs_head
          .value_counts()
          .sort_values(ascending=False)
          .head(10)
          .reset_index())

# Create the bar chart
fig = px.bar(gp, x='count', y='orgs_head', orientation='h')

fig.update_layout(
    layout,
    width=1200,
    height=400,
    title='Top 10 Organizations by Frequency, 2013-2023',
    xaxis={'title': "Frequency", 'range': [0, 500]},
    yaxis={'autorange': "reversed"},
)

fig.update_traces(
    marker_color='#0E86D4',
    opacity=0.7,
    textposition='outside',
    texttemplate='%{x}',
    textfont={'color': 'black'},
    cliponaxis=True,
    hovertemplate='Organization: %{y}<br>Count: %{x}<extra></extra>',
)

fig.show()

Geospatial Analysis

# NOTE(review): redundant — `orgs` was already restricted to 2013-2023 in the
# previous section, so this re-filter is a no-op (kept for cell independence)
orgs = orgs[(orgs['year'] > 2012) & (orgs['year'] < 2024)]
orgs.head()
url org_cn city_cn city_en org_cn_head orgs_head title_cn year
314 http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230101&flag=1 中国石油勘探开发研究院 北京 Beijing 中国石油勘探开发研究院 Research Institute of Petroleum Exploration and Development 油气安全战略与“双碳”战略:关系与路径 2023
315 http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230101&flag=1 国家油气战略研究中心 北京 Beijing 国家油气战略研究中心 National Oil and Gas Strategic Research Center 油气安全战略与“双碳”战略:关系与路径 2023
316 http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230102&flag=1 深部煤矿采动响应与灾害防控国家重点实验室 淮南 Huainan 深部煤矿采动响应与灾害防控国家重点实验室 State Key Laboratory of Mining Response and Disaster Prevention In Deep Coal Mines 我国煤炭主体能源安全高质量发展的理论技术思考 2023
317 http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230103&flag=1 中国煤炭科工集团有限公司 北京 Beijing 中国煤炭科工集团有限公司 China Coal Technology Engineering Group (Ccteg) 新形势下我国能源高质量发展与能源安全 2023
318 http://old2022.bulletin.cas.cn/zgkxyyk/ch/reader/view_abstract.aspx?file_no=20230103&flag=1 煤炭科学研究总院 北京 Beijing 煤炭科学研究总院 China Coal Research Institute 新形势下我国能源高质量发展与能源安全 2023
from urllib.request import urlopen
import json

# prefecture-level GeoJSON boundaries for China, keyed by city name
with urlopen('https://unpkg.com/cn-atlas@0.1.2/prefectures.json') as response:
    cities = json.load(response)
import plotly.express as px

# distinct head organizations per (year, city)
gp = orgs.groupby(['year', 'city_en'])['orgs_head'].nunique().reset_index()

fig = px.choropleth_mapbox(
    data_frame=gp,
    geojson=cities,
    locations="city_en",
    featureidkey="properties.name",
    color='orgs_head',
    color_continuous_scale='dense',
    mapbox_style="carto-positron",
    center={"lat": 37.110573, "lon": 106.493924},
    zoom=3,
)

fig.update_layout(
    layout,
    title='Organizations by City, 2013-2023',
    width=800,
    height=500,
    coloraxis_colorbar={'title': 'Count'},
    margin={'r': 10, 'l': 10, 't': 50, 'b': 10},
)

fig.update_traces(
    marker_line_width=1,
    marker_line_color='black',
    hovertemplate='City: %{location}<br>Organizations: %{z}<extra></extra>',
)

fig.show()
import plotly.express as px

# total affiliation mentions per city (row counts — NOT distinct organizations)
gp = orgs['city_en'].value_counts().reset_index()

fig = px.choropleth_mapbox(
    data_frame=gp,
    geojson=cities,
    color='count',
    locations="city_en",
    featureidkey="properties.name",
    mapbox_style="carto-positron",
    color_continuous_scale='dense',
    center={"lat": 37.110573, "lon": 106.493924},
    zoom=3,
)

fig.update_layout(
    layout,
    title='Total Number of Affiliations by City, 2013-2023',
    width=800,
    height=500,
    coloraxis_colorbar=dict(title='Count'),
    margin=dict(r=10, l=10, t=50, b=10)
)

fig.update_traces(
    marker_line_width=1,
    marker_line_color='black',
    # FIX: this map plots total affiliation counts (matching the title), not
    # distinct organizations as the old hover label claimed
    hovertemplate='City: %{location}<br>Affiliations: %{z}<extra></extra>'
)

fig.show()

Collaboration Network

# keep only multi-organization articles: rows whose title has more than one
# distinct head organization attached
multi_org_mask = orgs.groupby('title_cn')['orgs_head'].transform('nunique') > 1
orgs_filtered = orgs.loc[multi_org_mask, ['year', 'orgs_head', 'title_cn']]

# one row per (year, article) with the array of participating organizations
gp = (orgs_filtered
      .groupby(['year', 'title_cn'])
      .orgs_head.unique()
      .reset_index())
from itertools import combinations

# initialize list to hold collaboration pairs
collaboration_pairs = []

# iterate through the grouped data
# FIX: the loop variable used to be named `orgs`, which clobbered the `orgs`
# DataFrame defined above; renamed to `org_list` to remove the shadowing bug
for idx, row in gp.iterrows():
    org_list = row['orgs_head']
    title = row['title_cn']
    year = row['year']
    # generate all possible unique pairs of organizations
    if len(org_list) > 1:
        for org_a, org_b in combinations(org_list, 2):
            collaboration_pairs.append((year, title, org_a, org_b))

# create a df from the pairs
collaboration_df = pd.DataFrame(collaboration_pairs, columns=[
                                'year', 'title_cn', 'org1', 'org2'])
collaboration_df.head()
collaboration_df.head()
year title_cn org1 org2
0 2013 中国地理信息系统的发展与展望 Institute of Geographic Sciences and Natural Resources Research, CAS Institute of Remote Sensing and Digital Earth, CAS
1 2013 中国地理信息系统的发展与展望 Institute of Geographic Sciences and Natural Resources Research, CAS National University of Defense Technology
2 2013 中国地理信息系统的发展与展望 Institute of Geographic Sciences and Natural Resources Research, CAS Wuhan University
3 2013 中国地理信息系统的发展与展望 Institute of Remote Sensing and Digital Earth, CAS National University of Defense Technology
4 2013 中国地理信息系统的发展与展望 Institute of Remote Sensing and Digital Earth, CAS Wuhan University
# calculate collaboration strength (=number of occurrences for a pair)
# group by year and organization pair, then count collaborations
collaboration_strength = collaboration_df.groupby(
    ['year', 'org1', 'org2']).size().reset_index(name='strength')

# ensure each pair appears only once per year ((A, B) = (B, A))
# normalize by sorting the two names so both orderings map to one tuple key
collaboration_strength['org_pair'] = collaboration_strength.apply(
    lambda row: tuple(sorted([row['org1'], row['org2']])), axis=1)
collaboration_strength = collaboration_strength.groupby(
    ['year', 'org_pair'])['strength'].sum().reset_index()

# split the org_pair back into separate columns
collaboration_strength[['org1', 'org2']] = pd.DataFrame(
    collaboration_strength['org_pair'].tolist(), index=collaboration_strength.index)

# drop the org_pair column
collaboration_strength = collaboration_strength.drop('org_pair', axis=1)

# sort by year ascending, then strength descending within each year
collaboration_strength = collaboration_strength.sort_values(
    ['year', 'strength'], ascending=[True, False])

# reset the index
collaboration_strength = collaboration_strength.reset_index(drop=True)
# strongest collaborations across all years
collaboration_strength.sort_values(by='strength', ascending=False).head(10)
year strength org1 org2
2009 2022 39 Institutes of Science and Development, CAS University of CAS
1354 2020 29 Institutes of Science and Development, CAS University of CAS
2350 2023 22 Institutes of Science and Development, CAS University of CAS
1355 2020 17 Institute of Geographic Sciences and Natural Resources Research, CAS University of CAS
99 2016 16 Institute of Geographic Sciences and Natural Resources Research, CAS University of CAS
1636 2021 15 Institutes of Science and Development, CAS University of CAS
1111 2019 14 Institutes of Science and Development, CAS University of CAS
100 2016 13 CAS Institute of Geographic Sciences and Natural Resources Research, CAS
1637 2021 13 CAS University of CAS
1356 2020 10 CAS University of CAS
import networkx as nx

# Build one undirected weighted graph over the whole 2013-2023 period.
G = nx.Graph()

# Add edges with weights
for _, row in collaboration_strength.iterrows():
    G.add_edge(row['org1'], row['org2'], weight=row['strength'])

# Calculate total strength for each organization
# NOTE(review): org_strength is computed but never used below
org_strength = {}
for org in G.nodes():
    org_strength[org] = sum(G[org][neighbor]['weight'] for neighbor in G[org])

# Set position layout
pos = nx.spring_layout(G, k=0.5, iterations=50)

# Create edge trace
edge_x = []
edge_y = []
edge_widths = []
for edge in G.edges(data=True):
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])
    # Calculate edge width based on the strength of collaboration
    edge_width = edge[2]['weight'] * 0.3  # Adjust this multiplier as needed
    edge_widths.append(edge_width)

# NOTE(review): this combined edge trace is never added to the figure — the
# loop further down adds one trace per edge instead, so each edge can carry
# its own width
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(color='#0E86D4', width=1),
    hoverinfo='none',
    mode='lines')

# Create node trace
node_x = []
node_y = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='Blues',
        reversescale=False,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title='Connections',
            xanchor='left',
            titleside='right',
            title_font_family="Verdana",
            tickfont_family="Verdana"
        ),
        line_width=2))

# Color node points by the number of connections
node_adjacencies = []
node_text = []
for node, adjacencies in G.adjacency():
    num_connections = len(adjacencies)
    node_adjacencies.append(num_connections)
    node_text.append(
        f'<span style="font-family: Verdana;">{node}<br>Connections: {num_connections}</span>')

node_trace.marker.color = node_adjacencies
node_trace.text = node_text

# share of possible edges that actually exist
network_density = nx.density(G)

fig = go.Figure()

# Add edge traces with varying widths
# edge_x holds [x0, x1, None] triples, hence the // 3 stride
for i in range(len(edge_x) // 3):
    fig.add_trace(go.Scatter(
        x=edge_x[i*3:(i+1)*3], y=edge_y[i*3:(i+1)*3],
        line=dict(width=edge_widths[i], color='#0E86D4'),
        hoverinfo='none',
        mode='lines'
    ))

fig.add_trace(node_trace)

fig.update_layout(
    template='plotly_white',
    width=900,
    height=600,
    title='Collaboration Network, 2013-2023',
    titlefont=dict(family="Verdana", size=16),
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    annotations=[dict(
        text=f"Number of organizations: {len(G.nodes())}. Network Density: {network_density:.2%}",
        showarrow=False,
        xref="paper", yref="paper",
        x=0.005, y=-0.002,
        font=dict(family="Verdana")
    )],
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    font=dict(family="Verdana")
)

fig.show()
def process_data(collaboration_strength):
    """Split the collaboration table into one weighted graph per year.

    Returns (graphs_by_year, all_nodes, years): a dict mapping each year
    (as a datetime value) to an nx.Graph, the union of all organization
    nodes, and the sorted list of years.
    """
    # FIX: operate on a copy — the original converted the caller's 'year'
    # column to datetime in place, mutating the global DataFrame as a side
    # effect of building the visualization
    collaboration_strength = collaboration_strength.copy()
    collaboration_strength['year'] = pd.to_datetime(
        collaboration_strength['year'], format='%Y')
    years = sorted(collaboration_strength['year'].unique())
    graphs_by_year = {}
    all_nodes = set()

    for year in years:
        year_data = collaboration_strength[collaboration_strength['year'] == year]
        G = nx.Graph()

        for _, row in year_data.iterrows():
            G.add_edge(row['org1'], row['org2'], weight=row['strength'])
            all_nodes.add(row['org1'])
            all_nodes.add(row['org2'])

        graphs_by_year[year] = G

    return graphs_by_year, all_nodes, years


def plot_graph(G, pos, fig):
    """Append one edge trace and one node trace for graph `G` to `fig`."""
    # edges: each segment is (x0, x1, None) so plotly draws disjoint lines
    xs_edges, ys_edges = [], []
    for u, v in G.edges():
        ux, uy = pos[u]
        vx, vy = pos[v]
        xs_edges += [ux, vx, None]
        ys_edges += [uy, vy, None]

    edge_trace = go.Scatter(
        x=xs_edges, y=ys_edges,
        mode='lines',
        hoverinfo='none',
        line=dict(color='#0E86D4', width=1),
    )

    # node coordinates in graph iteration order
    coords = [pos[node] for node in G.nodes()]
    node_x = [c[0] for c in coords]
    node_y = [c[1] for c in coords]

    # color and hover text encode the node degree
    node_adjacencies = [len(list(G.adj[node])) for node in G.nodes()]
    node_text = [f'{node}<br>Connections: {adj}'
                 for node, adj in zip(G.nodes(), node_adjacencies)]

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=True,
            colorscale='Blues',
            reversescale=False,
            color=node_adjacencies,
            size=10,
            colorbar=dict(thickness=15, title='Connections',
                          xanchor='left', titleside='right'),
            line_width=2
        ),
    )

    fig.add_trace(edge_trace)
    fig.add_trace(node_trace)

    return fig


def create_visualization(collaboration_strength):
    """Build the interactive per-year network figure with a year slider.

    Relies on plot_graph adding exactly two traces (edges, nodes) per year,
    so the traces for year i sit at indices i*2 and i*2 + 1.
    """
    graphs_by_year, all_nodes, years = process_data(collaboration_strength)

    # Create a combined graph for consistent node positioning
    G_combined = nx.Graph()
    for G in graphs_by_year.values():
        G_combined = nx.compose(G_combined, G)

    pos = nx.spring_layout(G_combined, k=0.6, iterations=50)

    fig = go.Figure()

    for year in years:
        fig = plot_graph(G=graphs_by_year[year], pos=pos, fig=fig)

    # Set visibility for traces
    # NOTE(review): this `visibility` list is discarded (it is rebuilt inside
    # the slider-step loop below); only the two direct assignments that
    # follow determine the initial view (first year's edge + node traces)
    visibility = [False] * len(fig.data)
    visibility[0] = True
    fig.update_traces(visible=False)
    fig.data[0].visible = True
    fig.data[1].visible = True

    steps = []
    for i, year in enumerate(years):
        year_str = year.strftime('%Y')
        visibility = [False] * len(fig.data)
        # two traces per year: edges at i*2, nodes at i*2 + 1
        start_idx = i * 2
        end_idx = (i + 1) * 2
        for j in range(start_idx, end_idx):
            visibility[j] = True

        network_density = nx.density(graphs_by_year[year])

        step = dict(
            method="update",
            args=[
                {"visible": visibility},
                {"annotations": [dict(
                    text=f"Number of organizations: {len(graphs_by_year[year].nodes())}<br>Network Density: {network_density:.2%}",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002,
                    font=dict(family="Verdana")
                )]}
            ],
            label=f"{year_str}"
        )
        steps.append(step)

    initial_network_density = nx.density(graphs_by_year[years[0]])

    fig.update_layout(
        template='plotly_white',
        width=900,
        height=600,
        title='Collaboration Network by Year, 2013-2023',
        titlefont=dict(family="Verdana", size=16),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        annotations=[dict(
            text=f"Number of organizations: {len(graphs_by_year[years[0]].nodes())}<br>Network Density: {initial_network_density:.2%}",
            showarrow=False,
            xref="paper", yref="paper",
            x=0.005, y=-0.002,
            font=dict(family="Verdana")
        )],
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        font=dict(family="Verdana"),
        sliders=[{
            "active": 0,
            "currentvalue": {"visible": False},
            "pad": {"t": 50},
            "steps": steps
        }],
    )

    return fig


# build and render the slider figure
fig = create_visualization(collaboration_strength)
fig.show()

Authors

Top Authors

# `authors_flat` (presumably one row per article-author pair) is built in a
# part of the notebook not shown here — TODO confirm its construction
authors_past_decade = authors_flat[authors_flat['year'] > 2012]
authors_past_decade.author_cn.value_counts().reset_index().head(10)
author_cn count
0 not_specified 1735
1 本刊编辑部 42
2 白春礼 36
3 本刊特约评论员 29
4 潘教峰 27
5 郭华东 21
6 樊杰 20
7 李晓轩 13
8 王竑晟 13
9 汪寿阳 13
# list of authors to exclude
exclude_list = ['not_specified', '本刊编辑部', '本刊特约评论员', '《中国科学院院刊》编辑部']
# keep everything that is NOT a placeholder/editorial entry
keep_mask = ~authors_past_decade['author_cn'].isin(exclude_list)
authors_past_decade = authors_past_decade[keep_mask]

# top authors
authors_past_decade.author_cn.value_counts().reset_index().head(10)
author_cn count
0 白春礼 36
1 潘教峰 27
2 郭华东 21
3 樊杰 20
4 王竑晟 13
5 李晓轩 13
6 汪寿阳 13
7 傅伯杰 12
8 陈凯华 12
9 张凤 12
# Total Views per Author
(authors_past_decade
 .groupby('author_cn')['views'].sum()
 .sort_values(ascending=False)
 .reset_index()
 .head(10))
author_cn views
0 钟少颖 126378
1 张学成 119748
2 白春礼 103603
3 郭华东 84128
4 樊杰 70560
5 潘教峰 57249
6 骆永明 44810
7 周桔 36632
8 牛文元 34571
9 李宇 34096
# Calculate Average Views per Publication

author_views_avg = authors_past_decade.groupby(
    'author_cn')['views'].sum().reset_index()

# publication counts per author; Series.map accepts a Series keyed by author,
# so no intermediate dict is needed
publication_counts = authors_past_decade.author_cn.value_counts()
author_views_avg['count'] = author_views_avg['author_cn'].map(
    publication_counts)

author_views_avg['views_avg'] = (author_views_avg['views']
                                 / author_views_avg['count'])
author_views_avg.sort_values(by='views_avg', ascending=False).head(10)
author_cn views count views_avg
1294 张学成 119748 1 119748.0
3532 钟少颖 126378 3 42126.0
2285 段培君 Robert G. Eccles 16555 1 16555.0
2730 王长林 16136 1 16136.0
2871 童庆禧 12675 1 12675.0
956 姜景山 12555 1 12555.0
695 厉为 12165 1 12165.0
1855 李安 12165 1 12165.0
2464 王万玉 12165 1 12165.0
4031 黄鹏 12165 1 12165.0
# authors with more than one publication (single-paper outliers otherwise
# dominate the average-views ranking)
author_views_avg[author_views_avg['count'] > 1].sort_values(
    by='views_avg', ascending=False).head(10)
author_cn views count views_avg
3532 钟少颖 126378 3 42126.0
3732 陈运法 17036 2 8518.0
3689 陈梦舫 16338 2 8169.0
516 刘建波 15726 2 7863.0
1558 徐波 14381 2 7190.5
931 姚利 13445 2 6722.5
2704 王莉莉 13445 2 6722.5
3383 邱玉宝 25018 4 6254.5
3715 陈田 18741 3 6247.0
2603 王新明 12210 2 6105.0
# distribution of per-author average views (heavy right tail: max far above mean)
author_views_avg.views_avg.describe().reset_index()
index views_avg
0 count 4066.000000
1 mean 2504.864212
2 std 2530.546228
3 min 247.000000
4 25% 1410.000000
5 50% 2325.000000
6 75% 3320.625000
7 max 119748.000000

Fund Projects

# fund_projects_flat: one row per (article, fund project) pair
fund_projects = pd.read_csv('data/fund_projects_flat.csv')
fund_projects.shape
(1742, 4)
# filter out 2024
fund_projects = fund_projects[fund_projects['year'] < 2024]
fund_projects.shape
(1682, 4)
# number of distinct funding programmes
fund_projects.fund_project.nunique()
868
# fund-project mentions per year; years without any are dropped
gp = fund_projects.groupby('year')['fund_project'].count().reset_index()
gp = gp[gp['fund_project'] > 0]

fig = px.bar(gp, x='year', y='fund_project')

fig.update_layout(
    layout,
    title='Number of Fund Projects by Year',
    xaxis={'title': "", 'type': 'category'},
    yaxis={'title': "Fund Projects"},
    margin={'r': 50, 'l': 50, 'b': 0, 't': 50},
)

fig.update_traces(
    marker_color='#0E86D4',
    opacity=0.7,
    textposition='outside',
    texttemplate='%{y}',
    textfont={'color': 'black', 'size': 14, 'family': 'Verdana'},
    hovertemplate='Year: %{x}<br>Fund Projects: %{y}<extra></extra>',
)

fig.show()
# Share of articles carrying a fund project, by year.
# (the year < 2024 filter is a no-op here — df was already filtered — but is
# kept so the cell stands on its own)
gp = df[df['year'] < 2024].groupby('year')['title_cn'].count().reset_index()
fund_count = df.groupby('year')['fund_project'].count().reset_index()
gp = gp.merge(fund_count, on='year', how='left')
gp = gp[gp['fund_project'] > 0]
gp['fund_per_article'] = gp['fund_project'] / gp['title_cn']

fig = px.bar(gp,
             x='year',
             y='fund_per_article',
             )

fig.update_layout(
    layout,
    xaxis=dict(
        title="",
        type='category',
    ),
    # FIX: the y axis shows the per-article ratio (rendered as %), not the raw
    # project count the old "Fund Projects" label implied
    yaxis=dict(title="Fund Projects per Article"),
)

fig.update_traces(
    textposition='outside',
    texttemplate='%{y:.0%}',
    textfont=dict(color='black', size=14, family='Verdana'),
    marker_color='#0E86D4',
    opacity=0.7,
    hovertemplate='Year: %{x}<br>Fund Projects per Article: %{y:.2f}<extra></extra>'
)

fig.show()

Keywords

Article Description Keywords

# keep only the columns needed for keyword analysis, dropping articles
# without any keywords
keywords = df[['year', 'title_cn', 'keywords_cn']].copy()
keywords = keywords.dropna(subset=['keywords_cn'])
keywords.head()
year title_cn keywords_cn
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略
170 2023 我国煤炭主体能源安全高质量发展的理论技术思考 能源安全,煤炭智能精准开采,清洁高效利用,碳中和科学发展
171 2023 新形势下我国能源高质量发展与能源安全 能源安全,高质量发展,综合能源保障体系,全方位安全观,能源与矿业治理
172 2023 页岩油开发利用及在能源中的作用 页岩油,能源安全,开发利用,能源体系,政策建议,中国
173 2023 碳中和目标下中国新能源使命 碳达峰,碳中和,碳中和学,新能源,能源转型,能源独立,碳中和社会
# number of articles with at least one keyword
keywords.shape
(4962, 3)
import regex as re

# normalize delimiters: turn mixed CJK/ASCII punctuation into commas, then
# collapse runs of commas into a single one
keywords.keywords_cn = (
    keywords.keywords_cn
    .str.replace('[,,;;()!!~]', ',', regex=True)
    .str.replace(r',+', ',', regex=True)
)

# explode to one row per keyword
keywords['keywords_list'] = keywords.keywords_cn.str.split(',')
keywords_flat = (keywords
                 .explode('keywords_list')
                 .rename(columns={'keywords_list': 'keyword'}))
# trim surrounding spaces from each keyword
keywords_flat.keyword = keywords_flat.keyword.str.strip(' ')
keywords_flat.head()
year title_cn keywords_cn keyword
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 碳达峰
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 碳中和
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 油气安全
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 关系
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 路径
# keyword frequency table (value_counts already returns counts in descending
# order, so the extra sort_values is a harmless no-op)
keywords_stats = keywords_flat['keyword'].value_counts(
).reset_index().sort_values(ascending=False, by='count')


def get_keywords_stats(keywords_stats):
    """Add 'share' and 'cumulative_share' percentage columns in place.

    `keywords_stats` must contain a 'count' column, ordered the way the
    caller wants the cumulative share accumulated. The frame is mutated and
    also returned, so the function can be used in fluent style (and matches
    the returning variant defined later in the notebook).
    """
    # compute the total once instead of re-summing for each column
    total = keywords_stats['count'].sum()
    keywords_stats['share'] = 100. * keywords_stats['count'] / total
    keywords_stats['cumulative_share'] = (
        100. * keywords_stats['count'].cumsum() / total)
    return keywords_stats


# annotate the frequency table with share columns (mutates keywords_stats)
get_keywords_stats(keywords_stats)
keywords_stats.head(25)
keyword count share cumulative_share
0 中国科学院 820 3.060387 3.060387
1 基础研究 107 0.399343 3.459730
2 可持续发展 103 0.384414 3.844144
3 科学家 83 0.309771 4.153915
4 研究所 83 0.309771 4.463686
5 中国科学院院士 81 0.302306 4.765992
6 建议 76 0.283646 5.049638
7 学部委员 75 0.279913 5.329551
8 国际合作 72 0.268717 5.598268
9 中国 70 0.261253 5.859521
11 科技创新 59 0.220199 6.079719
10 中国科学院学部 59 0.220199 6.299918
12 知识创新工程 53 0.197805 6.497723
13 科学技术 53 0.197805 6.695529
14 气候变化 51 0.190341 6.885870
15 院士 49 0.182877 7.068747
16 国家重点实验室 49 0.182877 7.251623
17 中科院 48 0.179145 7.430768
18 创新 47 0.175412 7.606180
19 对策 44 0.164216 7.770396
21 研究成果 40 0.149287 7.919684
20 发展 40 0.149287 8.068971
22 青藏高原 37 0.138091 8.207061
27 碳中和 36 0.134358 8.341420
29 改革 36 0.134358 8.475778
# long tail: the 25th-75th percentiles are all 1, i.e. most keywords occur once
keywords_stats['count'].describe()
count    14530.000000
mean         1.844047
std          7.613227
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max        820.000000
Name: count, dtype: float64
# Horizontal bar chart of the 20 most frequent keywords.
fig = px.bar(keywords_stats.head(20), x='count', y='keyword', orientation='h')

fig.update_layout(
    layout,
    title='Top 20 Keywords between 2013-2023',
    width=700,
    height=600,
    # FIX: the counts are on the x axis; the old code titled the y axis
    # (which lists the keyword labels) "Word Count"
    xaxis=dict(
        title="Word Count",
        range=[0, keywords_stats['count'].max() * 1.1]
    ),
    yaxis=dict(
        title="",
        autorange='reversed'
    ),
    margin=dict(r=50, l=50, b=0, t=50)
)

fig.update_traces(
    textposition='outside',
    texttemplate='%{x}',
    textfont=dict(color='black'),
    marker_color='#0E86D4',
    opacity=0.7,
    hovertemplate='Keyword: %{y}<br>Count: %{x}<extra></extra>'
)

fig.show()
NameError: name 'px' is not defined
# per-year keyword frequencies (one row per year/keyword pair)
keywords_top_year = keywords_flat.groupby(
    'year')['keyword'].value_counts().reset_index()
keywords_top_year
year keyword count
0 1986 中国科学院 26
1 1986 研究所 7
2 1986 基础研究 5
3 1986 平装 4
4 1986 新书 4
... ... ... ...
21843 2023 高原湖泊 1
21844 2023 高影响专利 1
21845 2023 高被引论文 1
21846 2023 高质量 1
21847 2023 黄土高原 1

21848 rows × 3 columns

from plotly.subplots import make_subplots

years = list(range(2013, 2024))

# 6x2 grid, one subplot per year (11 years, last cell stays empty)
fig = make_subplots(
    rows=6,
    cols=2,
    subplot_titles=[f'Top 10 Keywords in {year}' for year in years],
    vertical_spacing=0.05,
    horizontal_spacing=0.21
)

for i, year in enumerate(years):
    # top-10 keyword shares (% of that year's keyword mentions)
    gp = (keywords_flat[keywords_flat['year'] == year].keyword.value_counts(
        normalize=True) * 100).reset_index().head(10)

    fig.add_trace(
        go.Bar(
            x=gp['proportion'],
            y=gp['keyword'],
            orientation='h',
            marker_color='#0E86D4',
            opacity=0.7,
            text=gp['proportion'].apply(lambda x: f'{x:.2f}%'),
            textposition='outside',
            textfont=dict(color='black'),
            hovertemplate='Keyword: %{y}<br>Share: %{x:.2f}%<extra></extra>'
        ),
        row=(i // 2) + 1, col=(i % 2) + 1  # Calculate row and column
    )

    fig.update_yaxes(autorange="reversed", ticklabelposition="outside", row=(
        i // 2) + 1, col=(i % 2) + 1, showline=True, linecolor='black')

# FIX: without row/col this call targets every x axis, so the original ran it
# redundantly on each loop iteration; one call after the loop yields the same
# final figure
fig.update_xaxes(title_text="Share", range=[
                 0, 4.5], showline=True, linecolor='black')

fig.update_layout(
    layout,
    height=2000,
    width=800,
    showlegend=False,
)

fig.show()

Jieba Tokenization

import jieba


# matches everything EXCEPT CJK ideographs; compiled once instead of on
# every call as the original did
_NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')


# clean and preprocess Chinese with jieba
def tokenize(text):
    """Strip non-Chinese characters from `text` and return its jieba
    segmentation as a space-separated string."""
    text = _NON_CHINESE.sub('', text)

    # jieba.cut returns a generator; join consumes it directly (the original
    # made a pointless intermediate list copy)
    return ' '.join(jieba.cut(text, cut_all=False))


keywords['keywords_tokenized'] = keywords.keywords_cn.apply(tokenize)
Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/fm/r7lztspd5m77mjytcdb5636w0000gn/T/jieba.cache
Loading model cost 0.442 seconds.
Prefix dict has been built successfully.
# compare raw comma-separated keywords to their jieba tokenization
keywords[['keywords_cn', 'keywords_tokenized']].head()
keywords_cn keywords_tokenized
169 碳达峰,碳中和,油气安全,关系,路径,战略 碳达峰 碳 中 和 油气 安全 关系 路径 战略
170 能源安全,煤炭智能精准开采,清洁高效利用,碳中和科学发展 能源安全 煤炭 智能 精准 开采 清洁 高效 利用 碳中 和 科学 发展
171 能源安全,高质量发展,综合能源保障体系,全方位安全观,能源与矿业治理 能源安全 高质量 发展 综合 能源 保障体系 全方位 安全观 能源 与 矿业 治理
172 页岩油,能源安全,开发利用,能源体系,政策建议,中国 页岩 油 能源安全 开发利用 能源 体系 政策 建议 中国
173 碳达峰,碳中和,碳中和学,新能源,能源转型,能源独立,碳中和社会 碳达峰 碳 中 和 碳 中和学 新能源 能源 转型 能源 独立 碳中 和 社会
from sklearn.feature_extraction.text import CountVectorizer


# create a CountVectorizer object
vectorizer = CountVectorizer()

# fit and transform the tokenized keywords into a sparse document-term matrix
X = vectorizer.fit_transform(keywords['keywords_tokenized'])

# Get the word frequencies
# X.sum(axis=0) is a 1 x n_tokens matrix; transpose so tokens become rows
jieba_word_counts = pd.DataFrame(
    X.sum(axis=0), columns=vectorizer.get_feature_names_out()).T
# after the transpose the counts live in column 0
jieba_word_counts = jieba_word_counts.sort_values(
    ascending=False, by=0).reset_index()
jieba_word_counts = jieba_word_counts.rename(
    columns={'index': 'keyword', 0: 'count'})
def get_keywords_stats(jieba_word_counts):
    """Return *jieba_word_counts* with per-token percentage shares added.

    Parameters
    ----------
    jieba_word_counts : pd.DataFrame
        Must contain 'keyword' and 'count' columns; should be sorted by
        'count' descending, since 'cumulative_share' is only meaningful
        on sorted input.

    Returns
    -------
    pd.DataFrame
        Columns ['keyword', 'count', 'share', 'cumulative_share'],
        with both shares expressed in percent (0-100).
    """
    # Work on a copy so the caller's frame is not mutated in place
    # (the original version silently added columns to its argument).
    stats = jieba_word_counts.copy()
    total = stats['count'].sum()  # compute the grand total once, not twice
    stats['share'] = stats['count'] / total * 100
    stats['cumulative_share'] = stats['count'].cumsum() / total * 100
    return stats[['keyword', 'count', 'share', 'cumulative_share']]


# Percentage share and cumulative share per token.
jieba_word_counts_stats = get_keywords_stats(jieba_word_counts)
jieba_word_counts_stats.head(25)  # top 25 tokens by frequency
keyword count share cumulative_share
0 中国科学院 991 2.219833 2.219833
1 研究 823 1.843514 4.063347
2 发展 728 1.630715 5.694062
3 科技 647 1.449275 7.143337
4 创新 535 1.198396 8.341733
5 科学 534 1.196156 9.537889
6 技术 417 0.934077 10.471966
7 研究所 342 0.766078 11.238044
8 国家 314 0.703358 11.941402
9 中国 277 0.620478 12.561880
10 战略 271 0.607038 13.168918
11 生态 265 0.593598 13.762516
12 合作 262 0.586878 14.349394
13 国际 256 0.573438 14.922832
14 工程 256 0.573438 15.496270
15 院士 232 0.519678 16.015949
16 生物 230 0.515198 16.531147
17 实验室 222 0.497278 17.028426
18 工作 217 0.486078 17.514504
19 基础 213 0.477118 17.991622
20 持续 212 0.474878 18.466501
21 经济 200 0.447999 18.914499
22 科学技术 185 0.414399 19.328898
23 物理 184 0.412159 19.741057
24 社会 181 0.405439 20.146496
# Horizontal bar chart of the 20 most frequent jieba tokens.
fig = px.bar(
    jieba_word_counts_stats.head(20),
    y='keyword',
    x='count',
    orientation='h'
)

fig.update_layout(
    layout,
    title='Top 20 Tokens between 2013-2023',
    height=600,
    width=700,
    xaxis={'range': [0, jieba_word_counts_stats['count'].max() * 1.1],
           'title': "Token Count"},
    yaxis={'autorange': 'reversed'},  # most frequent token on top
    margin={'l': 50, 'r': 50, 't': 50, 'b': 0}
)

fig.update_traces(
    marker_color='#0E86D4',
    opacity=0.7,
    texttemplate='%{x}',
    textposition='outside',
    textfont={'color': 'black'},
    hovertemplate='Token: %{y}<br>Count: %{x}<extra></extra>'
)
fig.show()
# Convert each space-joined token string back into a list of tokens;
# vectorized .str.split() replaces the per-element lambda.
keywords['keywords_tokenized'] = keywords['keywords_tokenized'].str.split()
keywords.head()
year title_cn keywords_cn keywords_list keywords_tokenized
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] [碳达峰, 碳, 中, 和, 油气, 安全, 关系, 路径, 战略]
170 2023 我国煤炭主体能源安全高质量发展的理论技术思考 能源安全,煤炭智能精准开采,清洁高效利用,碳中和科学发展 [能源安全, 煤炭智能精准开采, 清洁高效利用, 碳中和科学发展] [能源安全, 煤炭, 智能, 精准, 开采, 清洁, 高效, 利用, 碳中, 和, 科学, 发展]
171 2023 新形势下我国能源高质量发展与能源安全 能源安全,高质量发展,综合能源保障体系,全方位安全观,能源与矿业治理 [能源安全, 高质量发展, 综合能源保障体系, 全方位安全观, 能源与矿业治理] [能源安全, 高质量, 发展, 综合, 能源, 保障体系, 全方位, 安全观, 能源, 与, 矿业, 治理]
172 2023 页岩油开发利用及在能源中的作用 页岩油,能源安全,开发利用,能源体系,政策建议,中国 [页岩油, 能源安全, 开发利用, 能源体系, 政策建议, 中国] [页岩, 油, 能源安全, 开发利用, 能源, 体系, 政策, 建议, 中国]
173 2023 碳中和目标下中国新能源使命 碳达峰,碳中和,碳中和学,新能源,能源转型,能源独立,碳中和社会 [碳达峰, 碳中和, 碳中和学, 新能源, 能源转型, 能源独立, 碳中和社会] [碳达峰, 碳, 中, 和, 碳, 中和学, 新能源, 能源, 转型, 能源, 独立, 碳中, 和, 社会]
# One row per (article, token): explode the token lists and give the
# token column a clearer name.
keywords_jieba_flat = (
    keywords
    .explode('keywords_tokenized')
    .rename(columns={'keywords_tokenized': 'keyword_jieba'})
)
keywords_jieba_flat.head()
year title_cn keywords_cn keywords_list keyword_jieba
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] 碳达峰
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略]
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略]
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略]
169 2023 油气安全战略与“双碳”战略:关系与路径 碳达峰,碳中和,油气安全,关系,路径,战略 [碳达峰, 碳中和, 油气安全, 关系, 路径, 战略] 油气
from plotly.subplots import make_subplots

years = list(range(2013, 2024))

# 6x2 grid: one horizontal bar chart of the year's top-10 token shares
# per subplot.
fig = make_subplots(
    rows=6,
    cols=2,
    subplot_titles=[f'Top 10 Tokens in {year}' for year in years],
    vertical_spacing=0.05,
    horizontal_spacing=0.2
)

for i, year in enumerate(years):
    # Percentage share of each token within the year, top 10.
    # NOTE(review): the 'proportion' column name requires pandas >= 2.0
    # value_counts() semantics — confirm the environment's pandas version.
    gp = (keywords_jieba_flat[keywords_jieba_flat['year'] == year].keyword_jieba.value_counts(
        normalize=True)*100).reset_index().head(10)
    row, col = (i // 2) + 1, (i % 2) + 1  # grid position for this year
    fig.add_trace(
        go.Bar(
            x=gp['proportion'],
            y=gp['keyword_jieba'],
            orientation='h',
            marker_color='#0E86D4',
            opacity=0.7,
            text=gp['proportion'].apply(lambda v: f'{v:.2f}%'),
            textposition='outside',
            textfont=dict(color='black'),
            hovertemplate='Keyword: %{y}<br>Share: %{x:.2f}%<extra></extra>'
        ),
        row=row, col=col
    )

    # FIX: update_xaxes previously omitted row/col, so every iteration
    # restyled ALL subplots' x-axes (redundant quadratic work); target
    # only this subplot's axes, matching the y-axis call.
    fig.update_xaxes(title_text="Share", range=[0, 4.5], showline=True,
                     linecolor='black', row=row, col=col)
    fig.update_yaxes(autorange="reversed", ticklabelposition="outside",
                     showline=True, linecolor='black', row=row, col=col)

fig.update_layout(
    layout,
    height=2000,
    width=800,
    showlegend=False,
)

fig.show()
# 1-based rank of the first token whose cumulative share reaches 80%.
# FIX: the original iterrows() scan left keyword_index undefined
# (NameError) if no row ever reached 80%; the vectorized form below is
# robust to that and avoids the per-row Python loop.
reached = jieba_word_counts_stats['cumulative_share'] >= 80
if reached.any():
    keyword_index = reached.idxmax() + 1  # idxmax = first True label
else:
    keyword_index = len(jieba_word_counts_stats)

# Fraction of the vocabulary needed to cover 80% of all token occurrences.
top_words_share = keyword_index / jieba_word_counts_stats.keyword.nunique() * \
    100
print(
    f'Share of Words that make up 80% of words: {round(top_words_share, 2)}%')
Share of Words that make up 80% of words: 21.48%
jieba_word_counts_stats.head()
keyword count share cumulative_share
0 中国科学院 991 2.219833 2.219833
1 研究 823 1.843514 4.063347
2 发展 728 1.630715 5.694062
3 科技 647 1.449275 7.143337
4 创新 535 1.198396 8.341733
# Cumulative share of token frequency vs. token rank — visualizes how
# concentrated the vocabulary is.
fig = px.line(jieba_word_counts_stats, x=jieba_word_counts_stats.index,
              y='cumulative_share', markers=True)

fig.update_layout(
    layout,
    yaxis=dict(title="Cumulative Share"),
    xaxis=dict(title="Word Index", minor=dict(showgrid=True),),
    # FIX: was a placeholder-less f-string.
    title='Cumulative Distribution of Tokens',
    margin=dict(r=50, l=50, b=0, t=50)
)

fig.update_traces(
    marker_color='#0E86D4',
    opacity=0.7
)

fig.show()
# Token frequency against rank (roughly Zipf-shaped on linear axes).
fig = px.scatter(
    jieba_word_counts_stats,
    x=jieba_word_counts_stats.keyword.index,
    y="count",
)

fig.update_layout(
    layout,
    xaxis=dict(title="Token Index", minor=dict(showgrid=True),),
    yaxis=dict(title="Token Frequency"),
)

fig.update_traces(
    marker_color='#0E86D4',
    opacity=0.7,
    hovertemplate='Frequency: %{y}<br>Index: %{x}<extra></extra>',
)

fig.show()

Wordcloud

from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Font with CJK glyphs — required, otherwise Chinese renders as boxes.
font_path = '/Library/Fonts/STHeiti Light.ttc'
#mask_image = np.array(Image.open('/Users/dmitrijmazanik/Downloads/China-Map-PNG-Pic.png'))

# Flatten the list-of-token-lists into one space-separated string.
# FIX: Series.sum() concatenates Python lists pairwise, which is O(n^2);
# a flat generator join is linear and yields the identical string.
all_words = ' '.join(
    word for tokens in keywords['keywords_tokenized'] for word in tokens)

# Create the word cloud
wc = WordCloud(
    #mask=mask_image,
    font_path=font_path,
    max_words=1500,
    max_font_size=100,
    random_state=42,  # fixed seed => reproducible layout
    width=800, height=1000,

    contour_width=1,
    background_color="white",  # Set to None to utilize mask colors
    colormap='PuBu',  # You can choose a colormap that fits your design
).generate(all_words)

#image_colors_default = ImageColorGenerator(mask_image)

# Display the word cloud
plt.figure()
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')  # Turn off axis
plt.show()

Back to top